import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns; sns.set()
%matplotlib inline
import datetime
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, auc, confusion_matrix
from sklearn.model_selection import GridSearchCV
import sklearn
from sklearn_pandas import DataFrameMapper
# Load the Caravan insurance dataset (tab-separated) and build the
# learning/test partitions defined by the STATUS column.
data = pd.read_table("C:/Users/Sedki/Desktop/ML/lab3/Lab Police d'Assurance sur Caravane/AssurancExpertsInc.txt")
# Encode the target as 0/1 (any other value would become NaN).
data['CLASS'] = data['CLASS'].map({'Yes': 1, 'No': 0})
# NOTE(review): `train` is derived from the FULL dataset (learning + test
# rows); the later train_test_split on it mixes official test rows into
# training — confirm this is intended.
train = data.drop(columns=['STATUS'])  # keyword form: positional axis was removed in pandas 2.0
Train = data.loc[data['STATUS'] == 'Learning']
# Build the masks on the subset itself: indexing a subset with a full-length
# boolean Series is rejected by modern pandas.
yesTrain = Train.loc[Train['CLASS'] == 1]
noTrain = Train.loc[Train['CLASS'] == 0]
Test = data.loc[data['STATUS'] == 'Test']
noTest = Test.loc[Test['CLASS'] == 0]
yesTest = Test.loc[Test['CLASS'] == 1]
print("no", len(noTrain))
print("yes", len(yesTrain))
print('Proportion:', round(len(noTrain) / len(yesTrain)))
train.columns
from sklearn.decomposition import PCA

# Project the 85 descriptive attributes onto the first six principal components.
pca = PCA(n_components=6)
test = data.iloc[:, 0:85]
# Separating out the target
y_target = data.loc[:, ['CLASS']].values
pca_samples = pca.fit_transform(test)  # single fit+transform pass
ps = pd.DataFrame(pca_samples)
ps.head()
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d import proj3d

# Keep components PC3 and PC0 for the clustering step and eyeball them.
tocluster = pd.DataFrame(ps[[3, 0]])
tocluster.head()
print(tocluster.shape)
print(tocluster.head())
fig = plt.figure(figsize=(8, 8))
plt.plot(tocluster[3], tocluster[0], 'o',
         markersize=3, color='blue', alpha=0.5, label='class1')
plt.xlabel('x_values')
plt.ylabel('y_values')
plt.legend()
plt.show()
On a projeté plusieurs paires de composantes afin de chercher celle qui convient le mieux à un clustering KMeans ; on a retenu la paire (PC3, PC0). Puisque chaque composante est la projection de tous les points de l'ensemble de données original, chaque composante est représentative de l'ensemble de données.
# Attach the target to the two retained components and colour the scatter
# plot by class.
finalDf = pd.concat([tocluster, data[['CLASS']]], axis=1)
finalDf.head()
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(1, 1, 1)
ax.set_xlabel('Principal Component 0', fontsize=15)
ax.set_ylabel('Principal Component 3', fontsize=15)
ax.set_title('2 component PCA', fontsize=20)
targets = [0, 1]
colors = ['r', 'g']
# Vectorised scatter: two calls (one per class) instead of one matplotlib
# call per row — dramatically faster, renders the same picture.
neg = finalDf['CLASS'] == 0
ax.scatter(finalDf.loc[neg, 3], finalDf.loc[neg, 0], c='g', s=50)
pos = finalDf['CLASS'] == 1
ax.scatter(finalDf.loc[pos, 3], finalDf.loc[pos, 0], c='r', s=50)
#ax.legend(targets)
ax.grid()
Par cette projection on remarque des groupements d'individus, chaque groupe représentant un ensemble de caractéristiques communes. L'habillage selon la colonne 'CLASS' montre la superposition des deux réponses : on peut donc constater que les gens qui ont répondu « Yes » au questionnaire (intéressés par l'assurance caravane) n'ont pas obligatoirement un profil qui correspond réellement à l'assurance caravane, et inversement pour ceux qui ont répondu « No ».
On passe à la segmentation pour caractériser chaque groupe.
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Segment the (PC3, PC0) projection into three clusters.
clusterer = KMeans(n_clusters=3, random_state=42).fit(tocluster)
centers = clusterer.cluster_centers_
c_preds = clusterer.predict(tocluster)
print(centers)
print(c_preds[0:100])
import matplotlib
fig = plt.figure(figsize=(8, 8))
colors = ['orange', 'blue', 'purple', 'green']
colored = [colors[k] for k in c_preds]
#print (colored[0:10])
plt.scatter(tocluster[3], tocluster[0], color=colored)
# Overlay the cluster centres. (Loop indentation restored: it was lost in
# the notebook export.) Centre columns follow tocluster's column order
# [3, 0], so c[0] is the x coordinate (PC3) and c[1] the y (PC0).
for ci, c in enumerate(centers):
    plt.plot(c[0], c[1], 'o', markersize=8, color='red', alpha=0.9, label='' + str(ci))
plt.xlabel('x_values')
plt.ylabel('y_values')
#plt.xlabel('y_values')
#plt.ylabel('x_values')
plt.legend()
plt.show()
# Tag each original row with its cluster id and start profiling cluster 0.
clust_prod = test.copy()
clust_prod['cluster'] = c_preds
clust_prod.head()
df1 = clust_prod[clust_prod.cluster == 0]
print(len(df1))
df1.head()
# Customer sub-type (SD1) distribution within cluster 0.
plt.figure(figsize=(12, 20))
plt.subplot(413)
sns.countplot(x=df1.SD1, data=df1, palette='Set3')
plt.ylabel('Count', fontsize=15)
plt.xlabel('customer sub type', fontsize=15)
plt.xticks(rotation='vertical')
plt.title("Classifying customer sub type", fontsize=15)
plt.show()
(Students in apartments,Fresh masters in the city,Single youth,Suburban youth,Etnically diverse,Young urban have-nots,Mixed apartment dwellers,Young and rising,Young and low educated,Young seniors in the city,Own home elderly,Seniors in apartments,Residential elderly,Porchless seniors: no front yard,Religious elderly singles)
La plupart des individus sont de type 22, 23 et 24. Ce cluster se caractérise donc par des clients jeunes ainsi que par des personnes vivant seules.
# Customer main type (SD5) distribution within cluster 0.
plt.figure(figsize=(12, 20))
plt.subplot(413)
sns.countplot(x=df1.SD5, data=df1, palette='Set3')
plt.xticks(rotation='vertical')
plt.xlabel('customer main type', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.title("Classifying customer main type", fontsize=15)
plt.show()
(Career Loners,Living well,Cruising Seniors,Retired and Religeous)
Ce cluster représente les gens qui vivent bien, les personnes intéressées par les croisières ainsi que les retraités.
# Purchasing-power class (SD43) distribution within cluster 0.
plt.figure(figsize=(12, 20))
plt.subplot(413)
sns.countplot(x=df1.SD43, data=df1, palette='Set3')
plt.xticks(rotation='vertical')
plt.xlabel('purchasing power class', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.title("Classifying customer purchasing power class", fontsize=15)
plt.show()
# Cluster 1: size and customer sub-type (SD1) distribution.
df2 = clust_prod[clust_prod.cluster == 1]
print(len(df2))
df2.head()
plt.figure(figsize=(12, 20))
plt.subplot(413)
sns.countplot(x=df2.SD1, data=df2, palette='Set3')
plt.xticks(rotation='vertical')
plt.xlabel('customer sub type', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.title("Classifying customer sub type", fontsize=15)
plt.show()
(High Income, expensive child and Very Important Provincials,High status seniors,Affluent senior apartments,Mixed seniors,Career and childcare,Dinki's (double income no kids),Middle class families,Modern and complete families,Stable family,Family starters ,Affluent young families,Young all american family,Junior cosmopolitan,Senior cosmopolitans,Students in apartments)
Ce cluster se caractérise par des familles stables, de classe moyenne et modernes, ainsi que par les nouveaux couples…
# Customer main type (SD5) distribution within cluster 1.
plt.figure(figsize=(12, 20))
plt.subplot(413)
sns.countplot(x=df2.SD5, data=df2, palette='Set3')
plt.xticks(rotation='vertical')
plt.xlabel('customer main type', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.title("Classifying customer main type", fontsize=15)
plt.show()
(Successful hedonists,Driven Growers,Average Family,Career Loners(negligable))
Ce cluster contient les familles moyennes, les personnes hédonistes ainsi que les profils ambitieux (« Driven Growers »).
# Purchasing-power class (SD43) distribution within cluster 1.
plt.figure(figsize=(12, 20))
plt.subplot(413)
sns.countplot(x=df2.SD43, data=df2, palette='Set3')
plt.xticks(rotation='vertical')
plt.xlabel('purchasing power class', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.title("Classifying customer purchasing power class", fontsize=15)
plt.show()
# Cluster 2: size and customer sub-type (SD1) distribution.
df3 = clust_prod[clust_prod.cluster == 2]
print(len(df3))
df3.head()
plt.figure(figsize=(12, 20))
plt.subplot(413)
sns.countplot(x=df3.SD1, data=df3, palette='Set3')
plt.xticks(rotation='vertical')
plt.xlabel('customer sub type', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.title("Classifying customer sub type", fontsize=15)
plt.show()
(Porchless seniors: no front yard,Religious elderly singles,Low income catholics,Mixed seniors,Lower class large families,Large family and employed child,Village families,Couples with teens 'Married with children',Mixed small town dwellers,Traditional families,Large religous families,Large family farms,Mixed rurals)
Ce cluster se caractérise par les familles nombreuses de classe modeste ainsi que par les familles traditionnelles.
# Customer main type (SD5) distribution within cluster 2.
plt.figure(figsize=(12, 20))
plt.subplot(413)
sns.countplot(x=df3.SD5, data=df3, palette='Set3')
plt.xticks(rotation='vertical')
plt.xlabel('customer main type', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.title("Classifying customer main type", fontsize=15)
plt.show()
( Retired and Religeous,Family with grown ups,Conservative families,Farmers)
Ce cluster contient les familles nombreuses et conservatrices, les retraités ainsi que les agriculteurs.
# Purchasing-power class (SD43) distribution within cluster 2.
plt.figure(figsize=(12, 20))
plt.subplot(413)
sns.countplot(x=df3.SD43, data=df3, palette='Set3')
plt.xticks(rotation='vertical')
plt.xlabel('purchasing power class', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.title("Classifying customer purchasing power class", fontsize=15)
plt.show()
To get good results with this dataset we have to deal with two big challenges: the large number of features and the strong imbalance between the two classes.
In order to solve the first issue we tried several feature-selection techniques, so as to reduce the complexity of the models, reduce the noise and optimize the performance.
To deal with the second problem, the great disparity between classes, we rebalanced the training set:
# Class balance in the official test partition.
print("no", len(noTest))
print("yes", len(yesTest))
print('Proportion:', round(len(noTest) / len(yesTest)))
# NOTE(review): `train` still contains the official test rows, so this split
# leaks them into training — confirm this is intended.
X_train, X_test, Y_train, Y_test = train_test_split(
    train.drop(columns=['CLASS']), train['CLASS'], test_size=0.25)
print(Y_train.mean())
# Rebalance the training set with combined over/under sampling.
from imblearn.combine import SMOTETomek
sm = SMOTETomek()
# fit_resample replaces the fit_sample alias removed from modern imbalanced-learn.
X_Train, Y_Train = sm.fit_resample(X_train, Y_train)
print(len(Y_Train))
print(Y_Train.mean())
In the following part, we tried several classification algorithms, with and without feature selection, and then compared them using the ROC curve in order to identify the best candidate.
# Baseline logistic regression on the rebalanced training set.
log_r = LogisticRegression()
log_r.fit(X_Train, Y_Train)
log_r.score(X_test, Y_test)
predictions = log_r.predict(X_test)
# classification_report expects (y_true, y_pred); the arguments were swapped,
# which silently exchanges the precision and recall columns.
print(metrics.classification_report(Y_test, predictions))
mat = confusion_matrix(Y_test, predictions)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true label')
plt.ylabel('predicted label');
fprR, tprR, thresholdsR = metrics.roc_curve(Y_test, log_r.predict_proba(X_test)[:, 1])
plt.figure(figsize=(18, 12))
lw = 2.5
plt.plot(fprR, tprR, color='darkorange',
         lw=lw, label='ROC log reg (area = %0.2f)' % auc(fprR, tprR))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
# Restrict both matrices to the attribute columns SD1..PO64.
# NOTE(review): X_Train comes from SMOTETomek resampling; whether .loc works
# here depends on the installed imbalanced-learn returning a DataFrame
# rather than an ndarray — confirm against the environment.
X_train=X_Train.loc[:,'SD1':'PO64']
X_test=X_test.loc[:,'SD1':'PO64']
Recursive Feature Elimination (RFE) selects features by recursively considering smaller and smaller sets of features. For a linear model, it is based on the value of the coefficients (the lowest one in absolute value is removed at each step). The process continues until the desired number of features is reached.
Goal:
Detecting the subset of relevant features that yields the best generalization performance.
from sklearn.feature_selection import RFE

# Recursive Feature Elimination driven by a logistic regression.
log_lm = LogisticRegression()
selecteur = RFE(estimator=log_lm)
# launch the selection process
sol = selecteur.fit(X_train, Y_train)
# number of selected attributes
print(sol.n_features_)
# boolean mask of selected features
print(sol.support_)
# order of deletion
print(sol.ranking_)
# matrix for the selected attributes - training set
# we use the boolean vector sol.support_
log_lm = LogisticRegression()
# NOTE(review): numpy-style indexing here assumes X_Train is an ndarray
# (old imblearn); a DataFrame would need .loc/.iloc — confirm.
X_new_app = X_Train[:, sol.support_]
X_new_test = X_test.iloc[:, sol.support_]
print(X_new_app.shape)
# fit the model on the selected attributes
modele_sel = log_lm.fit(X_new_app, Y_Train)
# matrix for the selected attributes - test set
X_new_test = X_test.iloc[:, sol.support_].values
print(X_new_test.shape)
# prediction on the test set
y_pred_sel = modele_sel.predict(X_new_test)
# success rate
print(metrics.accuracy_score(Y_test, y_pred_sel))
# classification_report takes (y_true, y_pred); the arguments were swapped
print(metrics.classification_report(Y_test, y_pred_sel))
mat = confusion_matrix(Y_test, y_pred_sel)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true label')
plt.ylabel('predicted label');
# ROC curve of the logistic regression trained on the RFE-selected features.
fprlR, tprlR, thresholdslR = metrics.roc_curve(Y_test, log_lm.predict_proba(X_new_test)[:, 1])
roc_auc_lr = auc(fprlR, tprlR)
plt.figure(figsize=(18, 12))
lw = 2.5
plt.plot(fprlR, tprlR, color='darkorange', lw=lw,
         label='ROC log reg with fs (area = %0.2f)' % roc_auc_lr)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
# Gradient boosting on the full (rebalanced) feature set.
grd1 = GradientBoostingClassifier()
grd1.fit(X_Train, Y_Train)
grd1.score(X_test, Y_test)
predictions = grd1.predict(X_test)
# classification_report takes (y_true, y_pred); the arguments were swapped
print(metrics.classification_report(Y_test, predictions))
mat = confusion_matrix(Y_test, predictions)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true label')
plt.ylabel('predicted label');
fprgrd1, tprgrd1, thresholdsgrd1 = metrics.roc_curve(Y_test, grd1.predict_proba(X_test)[:, 1])
plt.figure(figsize=(18, 12))
lw = 2.5
plt.plot(fprgrd1, tprgrd1, color='darkorange',
         lw=lw, label='ROC grd without fs (area = %0.2f)' % auc(fprgrd1, tprgrd1))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
# Gradient boosting restricted to the RFE-selected features.
grd = GradientBoostingClassifier()
grd.fit(X_new_app, Y_Train)
grd.score(X_new_test, Y_test)
predictions = grd.predict(X_new_test)
print(metrics.classification_report(Y_test, predictions))
mat = confusion_matrix(Y_test, predictions)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true label')
plt.ylabel('predicted label');
fprgrd, tprgrd, thresholdsgrd = metrics.roc_curve(Y_test, grd.predict_proba(X_new_test)[:, 1])
plt.figure(figsize=(18, 12))
lw = 2.5
plt.plot(fprgrd, tprgrd, color='darkorange',
         lw=lw, label='ROC grd with fs (area = %0.2f)' % auc(fprgrd, tprgrd))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
from sklearn.neighbors import KNeighborsClassifier

# k-NN on the RFE-selected features: accuracy vs. number of neighbours.
training_accuracy = []
test_accuracy = []
neighbors_settings = range(1, 5)
plt.figure(figsize=(16, 10))
for n_neighbors in neighbors_settings:
    # (loop indentation restored; it was lost in the notebook export)
    clf = KNeighborsClassifier(n_neighbors=n_neighbors)
    clf.fit(X_new_app, Y_Train)
    training_accuracy.append(clf.score(X_new_app, Y_Train))
    test_accuracy.append(clf.score(X_new_test, Y_test))
plt.plot(neighbors_settings, training_accuracy, label='Accuracy of the training set')
plt.plot(neighbors_settings, test_accuracy, label='Accuracy of the test set')
plt.ylabel('Accuracy')
plt.xlabel('Number of Neighbors')
plt.legend()
# Retained model: 2 neighbours.
knn = KNeighborsClassifier(2)
knn.fit(X_new_app, Y_Train)
print('Accuracy of KNN n=2 , on the training set : {:.3f}'.format(knn.score(X_new_app,Y_Train)))
print('Accuracy of KNN n=2 , on the test set :{:.3f}'.format(knn.score(X_new_test,Y_test)))
predictions = knn.predict(X_new_test)
# classification_report takes (y_true, y_pred); the arguments were swapped
print(metrics.classification_report(Y_test, predictions))
mat = confusion_matrix(Y_test, predictions)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true label')
plt.ylabel('predicted label');
# Same k-NN study on the FULL feature set (no feature selection).
training_accuracy = []
test_accuracy = []
neighbors_settings = range(1, 4)
plt.figure(figsize=(16, 10))
for n_neighbors in neighbors_settings:
    # (loop indentation restored; it was lost in the notebook export)
    clf = KNeighborsClassifier(n_neighbors=n_neighbors)
    clf.fit(X_Train, Y_Train)
    training_accuracy.append(clf.score(X_Train, Y_Train))
    test_accuracy.append(clf.score(X_test, Y_test))
plt.plot(neighbors_settings, training_accuracy, label='Accuracy of the training set')
plt.plot(neighbors_settings, test_accuracy, label='Accuracy of the test set')
plt.ylabel('Accuracy')
plt.xlabel('Number of Neighbors')
plt.legend()
knn1 = KNeighborsClassifier(2)
knn1.fit(X_Train, Y_Train)
print('Accuracy of KNN n=2 , on the training set : {:.3f}'.format(knn1.score(X_Train,Y_Train)))
print('Accuracy of KNN n=2 , on the test set :{:.3f}'.format(knn1.score(X_test,Y_test)))
predictions = knn1.predict(X_test)
# classification_report takes (y_true, y_pred); the arguments were swapped
print(metrics.classification_report(Y_test, predictions))
mat = confusion_matrix(Y_test, predictions)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true label')
plt.ylabel('predicted label');
fprknn1, tprknn1, thresholdsknn1 = metrics.roc_curve(Y_test, knn1.predict_proba(X_test)[:, 1])
plt.figure(figsize=(18, 12))
lw = 2.5
plt.plot(fprknn1, tprknn1, color='darkorange',
         lw=lw, label='ROC knn without fs (area = %0.2f)' % auc(fprknn1, tprknn1))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
# ROC of the k-NN trained on the RFE-selected features.
fprknn, tprknn, thresholdsknn = metrics.roc_curve(Y_test, knn.predict_proba(X_new_test)[:, 1])
plt.figure(figsize=(18, 12))
lw = 2.5
plt.plot(fprknn, tprknn, color='darkorange',
         lw=lw, label='ROC knn with fs (area = %0.2f)' % auc(fprknn, tprknn))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
# Random forest on the full feature set (10 trees, no tuning).
rf_gs1 = RandomForestClassifier(n_estimators=10)
#parameters = {'max_depth':[5, 20], 'n_estimators':[10,300]}
#rf_gs1 = GridSearchCV(rf, parameters)
#rf_gs1.fit(X_Train, Y_Train)
rf_gs1.fit(X_Train, Y_Train)
print(rf_gs1.score(X_test, Y_test))
predictions = rf_gs1.predict(X_test)
# classification_report takes (y_true, y_pred); the arguments were swapped
print(metrics.classification_report(Y_test, predictions))
mat = confusion_matrix(Y_test, predictions)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true label')
plt.ylabel('predicted label');
fprrf1, tprrf1, thresholdsrf1 = metrics.roc_curve(Y_test, rf_gs1.predict_proba(X_test)[:, 1])
plt.figure(figsize=(18, 12))
lw = 2.5
plt.plot(fprrf1, tprrf1, color='darkorange',
         lw=lw, label='ROC rf without fs (area = %0.2f)' % auc(fprrf1, tprrf1))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
# Random forest on the RFE-selected features, tuned with a small grid search.
rf = RandomForestClassifier()
parameters = {'max_depth': [5, 25], 'n_estimators': [5, 400]}
rf_gs = GridSearchCV(rf, parameters)
rf_gs.fit(X_new_app, Y_Train)
print(rf_gs.score(X_new_test, Y_test))
predictions = rf_gs.predict(X_new_test)
print(metrics.classification_report(Y_test, predictions))
mat = confusion_matrix(Y_test, predictions)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true label')
plt.ylabel('predicted label');
fprrf, tprrf, thresholdsrf = metrics.roc_curve(Y_test, rf_gs.predict_proba(X_new_test)[:, 1])
plt.figure(figsize=(18, 12))
lw = 2.5
plt.plot(fprrf, tprrf, color='darkorange',
         lw=lw, label='ROC rf with fs (area = %0.2f)' % auc(fprrf, tprrf))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Random search over the gradient-boosting hyper-parameters (AUC objective).
grdc = GradientBoostingClassifier()
# NOTE(review): loss='deviance' was renamed 'log_loss' in scikit-learn 1.1 —
# confirm against the installed version.
gbHyperParams = {'loss': ['deviance', 'exponential'],
                 'n_estimators': randint(10, 100),
                 'max_depth': randint(10, 40)}
gridSearchGB = RandomizedSearchCV(estimator=grdc, param_distributions=gbHyperParams, n_iter=10,
                                  scoring='roc_auc', verbose=4).fit(X_Train, Y_Train)
gridSearchGB.best_params_, gridSearchGB.best_score_
# Small multilayer perceptron on the full feature set.
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(4, 2), random_state=1)
clf.fit(X_Train, Y_Train)
print(clf.score(X_test, Y_test))
predictions = clf.predict(X_test)
# classification_report takes (y_true, y_pred); the arguments were swapped
print(metrics.classification_report(Y_test, predictions))
mat = confusion_matrix(Y_test, predictions)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true label')
plt.ylabel('predicted label');
fprnn, tprnn, thresholdsnn = metrics.roc_curve(Y_test, clf.predict_proba(X_test)[:, 1])
plt.figure(figsize=(18, 12))
lw = 2.5
plt.plot(fprnn, tprnn, color='green',
         lw=lw, label='ROC nn without fs (area = %0.2f)' % auc(fprnn, tprnn))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble of the three previously fitted base models.
clf_voting = VotingClassifier(
    estimators=[('lr', log_r), ('grd', grd1), ('rf', rf_gs)],
    voting='soft')
clf_voting.fit(X_Train, Y_Train)
print(clf_voting.score(X_test, Y_test))
predictions = clf_voting.predict(X_test)
# classification_report takes (y_true, y_pred); the arguments were swapped
print(metrics.classification_report(Y_test, predictions))
mat = confusion_matrix(Y_test, predictions)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true label')
plt.ylabel('predicted label');
# NOTE: fprnn/tprnn are overwritten here; from now on they refer to the
# voting classifier, which is how the summary plot below labels them.
fprnn, tprnn, thresholdsnn = metrics.roc_curve(Y_test, clf_voting.predict_proba(X_test)[:, 1])
plt.figure(figsize=(18, 12))
lw = 2.5
# Label fixed: this curve belongs to the voting classifier, not the MLP.
plt.plot(fprnn, tprnn, color='green',
         lw=lw, label='ROC voting classifier (area = %0.2f)' % auc(fprnn, tprnn))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
# Overlay of the ROC curves of every model trained WITH feature selection.
plt.figure(figsize=(18, 12))
lw = 2.5
curves = [
    (fprrf, tprrf, 'darkorange', 'ROC rf with fs'),
    (fprknn, tprknn, 'black', 'ROC knn with fs'),
    (fprgrd, tprgrd, 'red', 'ROC grd with fs'),
    (fprlR, tprlR, 'green', 'ROC log reg with fs'),
    (fprnn, tprnn, 'yellow', 'ROC voting classifier'),
]
for fpr, tpr, col, name in curves:
    plt.plot(fpr, tpr, color=col, lw=lw,
             label='%s (area = %0.2f)' % (name, auc(fpr, tpr)))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
These are the ROC curves of all the previous models with feature selection.
It shows which models are best at identifying our target customers:
# Overlay of the ROC curves of every model trained WITHOUT feature selection.
# Fixed: the original plotted the with-fs curves (fprrf, fprknn, ...) while
# labelling them with the without-fs AUCs; curve and label now use the same data.
plt.figure(figsize=(18, 12))
lw = 2.5
plt.plot(fprrf1, tprrf1, color='darkorange',
         lw=lw, label='ROC rf without fs (area = %0.2f)' % auc(fprrf1, tprrf1))
plt.plot(fprknn1, tprknn1, color='black',
         lw=lw, label='ROC knn without fs (area = %0.2f)' % auc(fprknn1, tprknn1))
plt.plot(fprgrd1, tprgrd1, color='red',
         lw=lw, label='ROC grd without fs (area = %0.2f)' % auc(fprgrd1, tprgrd1))
plt.plot(fprR, tprR, color='green',
         lw=lw, label='ROC log reg without fs (area = %0.2f)' % auc(fprR, tprR))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
These are the ROC curves of all the previous models without feature selection.
It shows which model is by far the best at identifying our target customers:
def get_rappel_taille(x, features=None, labels=None):
    """Compute the (target size, recall) coordinates of a gain/lift chart.

    Parameters
    ----------
    x : fitted binary classifier exposing predict_proba.
    features : optional test matrix; defaults to the global X_test.
    labels : optional binary test labels (Series); defaults to the global Y_test.

    Returns
    -------
    (taille, rappel) : two 1-D arrays of length len(labels) — the fraction
    of the test set targeted and the recall reached at that depth.
    """
    X = X_test if features is None else features
    Y = Y_test if labels is None else labels
    # posterior probability of the positive class for every test instance
    score = x.predict_proba(X)[:, 1]
    # 0/1 indicator of the positive class, second dummy column
    # (assumes both classes occur in Y; .as_matrix() was removed in pandas 1.0,
    # replaced by .to_numpy())
    pos = pd.get_dummies(Y).to_numpy()[:, 1]
    # number of positive instances in the test set
    npos = np.sum(pos)
    # rank instances by decreasing score
    index = np.argsort(score)[::-1]
    sort_pos = pos[index]
    # cumulated number of positives found as we go down the ranking
    cpos = np.cumsum(sort_pos)
    # recall column
    rappel2 = cpos / npos
    # target size in percentage; the original hard-coded stop=2457, which
    # breaks whenever the test set has a different length
    n = Y.shape[0]
    taille2 = np.arange(start=1, stop=n + 1, step=1) / n
    return taille2, rappel2
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

# Tree-based feature selection: keep the features whose importance passes
# SelectFromModel's default threshold.
print("Initial shape: ", X_Train.shape)
clf = ExtraTreesClassifier()
clf = clf.fit(X_Train, Y_Train)
model = SelectFromModel(clf, prefit=True)
X_new1 = model.transform(X_Train)
X_new_test1 = model.transform(X_test)
print("New shape: ", X_new1.shape)
features_name = list(X_train)
print(sorted(zip(clf.feature_importances_, features_name), reverse=True))
# Logistic regression on the tree-selected features.
log_r = LogisticRegression()
log_r.fit(X_new1, Y_Train)
log_r.score(X_new_test1, Y_test)
predictions = log_r.predict(X_new_test1)
# classification_report takes (y_true, y_pred); the arguments were swapped
print(metrics.classification_report(Y_test, predictions))
mat = confusion_matrix(Y_test, predictions)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true label')
plt.ylabel('predicted label');
fprR, tprR, thresholdsR = metrics.roc_curve(Y_test, log_r.predict_proba(X_new_test1)[:, 1])
plt.figure(figsize=(18, 12))
lw = 2.5
plt.plot(fprR, tprR, color='darkorange',
         lw=lw, label='ROC log reg (area = %0.2f)' % auc(fprR, tprR))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
# Gradient boosting on the tree-selected features.
grd1 = GradientBoostingClassifier()
grd1.fit(X_new1, Y_Train)
grd1.score(X_new_test1, Y_test)
predictions = grd1.predict(X_new_test1)
# classification_report takes (y_true, y_pred); the arguments were swapped
print(metrics.classification_report(Y_test, predictions))
mat = confusion_matrix(Y_test, predictions)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true label')
plt.ylabel('predicted label');
fprgrd1, tprgrd1, thresholdsgrd1 = metrics.roc_curve(Y_test, grd1.predict_proba(X_new_test1)[:, 1])
plt.figure(figsize=(18, 12))
lw = 2.5
# Label fixed: this model IS trained with the tree-based feature selection.
plt.plot(fprgrd1, tprgrd1, color='darkorange',
         lw=lw, label='ROC grd with tree based fs (area = %0.2f)' % auc(fprgrd1, tprgrd1))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
# Shallow decision tree on the same features.
from sklearn.tree import DecisionTreeClassifier
clf_dt = DecisionTreeClassifier(max_depth=3, random_state=42)
clf_dt.fit(X_new1, Y_Train)
clf_dt.score(X_new_test1, Y_test)
predictions = clf_dt.predict(X_new_test1)
print(metrics.classification_report(Y_test, predictions))
mat = confusion_matrix(Y_test, predictions)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true label')
plt.ylabel('predicted label');
fprdt, tprdt, thresholdsdt = metrics.roc_curve(Y_test, clf_dt.predict_proba(X_new_test1)[:, 1])
plt.figure(figsize=(18, 12))
lw = 2.5
plt.plot(fprdt, tprdt, color='darkorange',
         lw=lw, label='ROC decision tree with tree based fs (area = %0.2f)' % auc(fprdt, tprdt))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
# Overlay of the ROC curves of the models trained on the tree-based feature
# selection. Fixed: the original plotted the stale RFE curves (fprgrd, fprlR)
# while labelling them with the tree-based AUCs; curve and label now agree.
plt.figure(figsize=(18, 12))
lw = 2.5
plt.plot(fprdt, tprdt, color='darkorange',
         lw=lw, label='ROC decision tree with tree based fs (area = %0.2f)' % auc(fprdt, tprdt))
plt.plot(fprgrd1, tprgrd1, color='red',
         lw=lw, label='ROC grd with tree based fs (area = %0.2f)' % auc(fprgrd1, tprgrd1))
plt.plot(fprR, tprR, color='green',
         lw=lw, label='ROC log reg with tree based fs (area = %0.2f)' % auc(fprR, tprR))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
These are the ROC curves of all the previous models with tree-based feature selection.
The gradient boosting classifier is still the best.
Univariate feature selection works by selecting the best features based on univariate statistical tests. It can be seen as a preprocessing step to an estimator.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Univariate selection: keep the 30 features with the best chi2 score.
KBest = SelectKBest(chi2, k=30)
X_new2 = KBest.fit_transform(X_Train, Y_Train)
# Only TRANSFORM the test set: the original re-fitted the selector on the
# test data, which leaks the test labels and may pick different columns.
X_new_test2 = KBest.transform(X_test)
print(sorted(zip(KBest.scores_, features_name), reverse=True))
print(X_new_test2.shape)  # was printing the unrelated X_new_test
# Logistic regression on the chi2-selected features.
log_r = LogisticRegression()
log_r.fit(X_new2, Y_Train)
log_r.score(X_new_test2, Y_test)
predictions = log_r.predict(X_new_test2)
# classification_report takes (y_true, y_pred); the arguments were swapped
print(metrics.classification_report(Y_test, predictions))
mat = confusion_matrix(Y_test, predictions)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true label')
plt.ylabel('predicted label');
fprR, tprR, thresholdsR = metrics.roc_curve(Y_test, log_r.predict_proba(X_new_test2)[:, 1])
plt.figure(figsize=(18, 12))
lw = 2.5
plt.plot(fprR, tprR, color='darkorange',
         lw=lw, label='ROC log reg with univariate feature selection (area = %0.2f)' % auc(fprR, tprR))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
# Gradient boosting on the chi2-selected features.
grd1 = GradientBoostingClassifier()
grd1.fit(X_new2, Y_Train)
grd1.score(X_new_test2, Y_test)
predictions = grd1.predict(X_new_test2)
print(metrics.classification_report(Y_test, predictions))
mat = confusion_matrix(Y_test, predictions)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true label')
plt.ylabel('predicted label');
fprgrd1, tprgrd1, thresholdsgrd1 = metrics.roc_curve(Y_test, grd1.predict_proba(X_new_test2)[:, 1])
plt.figure(figsize=(18, 12))
lw = 2.5
# Label fixed: this model uses the univariate (chi2) feature selection.
plt.plot(fprgrd1, tprgrd1, color='darkorange',
         lw=lw, label='ROC grd with univariate feature selection (area = %0.2f)' % auc(fprgrd1, tprgrd1))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
The results obtained with this set of features are not good.
# Rebalance the training set by randomly dropping majority-class (CLASS == 0)
# rows until the minority/majority ratio reaches 0.3.
# NOTE(review): `ratio=` and `fit_sample` are the legacy imbalanced-learn API;
# recent releases renamed them to `sampling_strategy=` and `fit_resample` —
# confirm the installed version before upgrading this call.
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(ratio=0.3)
X_Train, Y_Train = rus.fit_sample(X_train, Y_train)
print(len(Y_Train))
print(Y_Train.mean())  # fraction of positives after resampling (sanity check)
# Logistic regression trained on the undersampled training set (all features).
log_r = LogisticRegression()
log_r.fit(X_Train, Y_Train)
log_r.score(X_test, Y_test)
predictions = log_r.predict(X_test)
# BUG FIX: classification_report expects (y_true, y_pred); the original call
# swapped them, which mislabels per-class precision and recall.
print(metrics.classification_report(Y_test, predictions))
mat = confusion_matrix(Y_test, predictions)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true label')
plt.ylabel('predicted label');
fprr, tprr, thresholdsr = metrics.roc_curve(Y_test, log_r.predict_proba(X_test)[:, 1])
plt.figure(figsize=(18, 12))
lw = 2.5
plt.plot(fprr, tprr, color='darkorange',
         lw=lw, label='ROC log_r without fs (area = %0.2f)' % auc(fprr, tprr))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
# Gradient boosting trained on the undersampled training set (all features).
grd1 = GradientBoostingClassifier()
grd1.fit(X_Train, Y_Train)
grd1.score(X_test, Y_test)
predictions = grd1.predict(X_test)
# BUG FIX: classification_report expects (y_true, y_pred); swapped arguments
# mislabel per-class precision and recall.
print(metrics.classification_report(Y_test, predictions))
mat = confusion_matrix(Y_test, predictions)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true label')
plt.ylabel('predicted label');
fprgrd, tprgrd, thresholdsgrd = metrics.roc_curve(Y_test, grd1.predict_proba(X_test)[:, 1])
plt.figure(figsize=(18, 12))
lw = 2.5
plt.plot(fprgrd, tprgrd, color='darkorange',
         lw=lw, label='ROC grd without feature selection (area = %0.2f)' % auc(fprgrd, tprgrd))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
# Logistic regression restricted to the attributes kept by the RFE selector
# (boolean mask sol.support_). X_Train is a numpy array after resampling, so
# it is indexed positionally; X_test is still a DataFrame, hence .iloc.
log_lm = LogisticRegression()
X_new_app = X_Train[:, sol.support_]
# Built once as a numpy array (the original computed this twice, discarding
# the first result).
X_new_test = X_test.iloc[:, sol.support_].values
print(X_new_app.shape)   # (468, 4) 4 variables restantes
# fit the model on the selected attributes
modele_sel = log_lm.fit(X_new_app, Y_Train)
print(X_new_test.shape)  # (300, 4)
# prediction on the test set
y_pred_sel = modele_sel.predict(X_new_test)
# success rate
print(metrics.accuracy_score(Y_test, y_pred_sel))  # 0.787
# BUG FIX: classification_report expects (y_true, y_pred).
print(metrics.classification_report(Y_test, y_pred_sel))
mat = confusion_matrix(Y_test, y_pred_sel)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true label')
plt.ylabel('predicted label');
fprR1, tprR1, thresholdsR1 = metrics.roc_curve(Y_test, log_lm.predict_proba(X_new_test)[:, 1])
plt.figure(figsize=(18, 12))
lw = 2.5
plt.plot(fprR1, tprR1, color='darkorange',
         lw=lw, label='ROC log_r with feature selection (area = %0.2f)' % auc(fprR1, tprR1))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
# Gradient boosting restricted to the same RFE-selected attributes.
grd4 = GradientBoostingClassifier()
grd4.fit(X_new_app, Y_Train)
grd4.score(X_new_test, Y_test)
predictions = grd4.predict(X_new_test)
# BUG FIX: classification_report expects (y_true, y_pred); swapped arguments
# mislabel per-class precision and recall.
print(metrics.classification_report(Y_test, predictions))
mat = confusion_matrix(Y_test, predictions)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true label')
plt.ylabel('predicted label');
fprgrd4, tprgrd4, thresholdsgrd4 = metrics.roc_curve(Y_test, grd4.predict_proba(X_new_test)[:, 1])
plt.figure(figsize=(18, 12))
lw = 2.5
plt.plot(fprgrd4, tprgrd4, color='darkorange',
         lw=lw, label='ROC grd with feature selection (area = %0.2f)' % auc(fprgrd4, tprgrd4))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
# Overlay of the four ROC curves for side-by-side comparison.
plt.figure(figsize=(18, 12))
lw = 2.5
plt.plot(fprgrd, tprgrd, color='darkorange',
         lw=lw, label='ROC grd without fs (area = %0.2f)' % auc(fprgrd, tprgrd))
plt.plot(fprgrd4, tprgrd4, color='red',
         lw=lw, label='ROC grd with fs (area = %0.2f)' % auc(fprgrd4, tprgrd4))
plt.plot(fprR1, tprR1, color='green',
         lw=lw, label='ROC log reg with fs (area = %0.2f)' % auc(fprR1, tprR1))
# BUG FIX: this curve was also drawn in green, making it indistinguishable
# from the "log reg with fs" curve above.
plt.plot(fprr, tprr, color='purple',
         lw=lw, label='ROC log reg without fs (area = %0.2f)' % auc(fprr, tprr))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
These are the best models for predicting our target customers.
The gradient boosting classifier is still the best model.
# Gains chart ("courbe de gain"): recall achieved as a function of the
# fraction of the population targeted, one curve per fitted model.
size_grd, recall_grd = get_rappel_taille(grd1)
size_rf, recall_rf = get_rappel_taille(rf_gs1)
size_lr, recall_lr = get_rappel_taille(log_r)
size_vote, recall_vote = get_rappel_taille(clf_voting)
plt.figure(figsize=(18, 12))
lw = 2.5
plt.title('Courbe de gain')
plt.xlabel('Taille de cible')
plt.ylabel('Rappel')
# Both axes are fractions in [0, 1].
plt.xlim(0, 1)
plt.ylim(0, 1)
# Diagonal = what a random targeting strategy would achieve.
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.plot(size_grd, recall_grd, marker='.', color='red', lw=lw, label='grd')
plt.plot(size_rf, recall_rf, marker='.', color='green', lw=lw, label='rf')
plt.plot(size_lr, recall_lr, marker='.', color='m', lw=lw, label='log_reg')
plt.plot(size_vote, recall_vote, marker='.', color='black', lw=lw, label='voting classifier')
plt.legend(loc="lower right")
plt.show()
The x-axis of the chart shows the cumulative percentage of data records, sorted in decreasing order of score.
The y-axis shows the percentage of records that actually contain the selected target value among the records covered up to that point on the x-axis.
All models are far better than random guessing, but the gradient boosting classifier is the best.
Contact the fewest people, and get the maximum number of positive responses.
# Load the two LDA score files (semicolon-separated) and merge them.
scores = pd.read_csv("scores.csv", sep=';')
scores2 = pd.read_csv("scores2.csv", sep=';')
scores.head()
scores = pd.concat([scores, scores2], ignore_index=True)
len(scores)
# Parse the LDA score in one pass instead of four chained applies:
# keep the first 6 characters (as the original did) and convert the
# decimal comma to a dot before casting to float.
scores['score'] = scores['LD1'].apply(lambda v: float(str(v)[:6].replace(",", ".")))
# `axis=` keyword: positional axis in DataFrame.drop is deprecated.
scores = scores.drop(['LD1'], axis=1)
scores.head()
# Binarize the LDA score: clients scoring above 1.5 are predicted interested.
# Vectorized comparison replaces the original element-by-element append loop.
scores['scoreClass'] = (scores.score > 1.5).astype(int)
scores.head()
The threshold used is 1.5; it was extracted graphically from the LDA plot and will be confirmed to be accurate in the following.
# Split clients by predicted class and compare against the true CLASS label.
yes = scores.loc[scores['scoreClass'] == 1]       # predicted interested
no = scores.loc[scores['scoreClass'] == 0]        # predicted not interested
yesyes = yes.loc[yes['CLASS'] == 1]               # predicted yes AND truly interested
allyes = scores.loc[scores['CLASS'] == 1]         # every truly interested client
yesno = no.loc[no['CLASS'] == 1]                  # interested clients we would miss
print('')
print("- Proportion of clients that have minimum score of 1.5 : ", round(len(yes) / len(scores), 3))
print('')
# BUG FIX: corrected typos in the printed message ("actualy intersted").
print("-", round(len(yesyes) / len(yes), 2) * 100, "% of clients that have minimum score of 1.5 are actually interested in the product")
print('')
print("- Targeting only ", round(100 * len(yes) / len(scores), 2), '% of the clients we can select',
      round(len(yesyes) / len(allyes), 2) * 100, "% of all interested clients")
These results are very satisfying since we can target only a small fraction of the clients (6%) and still reach 27% of all potential interested ones.
This can be of huge help to the insurance company, since it will save them money and prevent them from disturbing uninterested customers.
# Bar chart of predicted class counts, split by the true CLASS label.
fig = plt.figure(figsize=(16, 8))
ax = sns.countplot(data=scores, x='scoreClass', hue='CLASS')
These variables are the ones identified as the most important by LDA and the ones that show the most variance between the two classes.
scores.columns
from pandas.plotting import parallel_coordinates
# Parallel-coordinates view of the socio-demographic variables SD31..SD38,
# min-max normalised and coloured by predicted class.
feature_labels = ['Home owners', 'National_Health', 'Private_health',
                  'Lower_level_education', 'Married', 'Income<30.000',
                  '1 car', 'Income 30-45.000']
subset = scores.loc[:, 'SD31':'SD38']
subset.columns = feature_labels
target = scores["scoreClass"]
# Rescale every variable to [0, 1] so the axes are comparable.
normalised = (subset - subset.min()) / (subset.max() - subset.min())
# Attach the class column the plot is coloured by.
plot_df = pd.concat([normalised[feature_labels], target], axis=1)
plt.figure(figsize=(15, 8))
parallel_coordinates(plot_df, 'scoreClass', color=['darkorange', 'black'])
plt.show()
# Parallel-coordinates view of the socio-demographic variables SD12..SD14,
# min-max normalised and coloured by predicted class.
feature_labels = ['Other relation', 'Middle management', 'Average income',
                  '2 cars', 'Income 45-75.000', 'Household without children']
subset = scores.loc[:, 'SD12':'SD14']
subset.columns = feature_labels
target = scores["scoreClass"]
# Rescale every variable to [0, 1] so the axes are comparable.
normalised = (subset - subset.min()) / (subset.max() - subset.min())
plot_df = pd.concat([normalised[feature_labels], target], axis=1)
plt.figure(figsize=(15, 8))
parallel_coordinates(plot_df, 'scoreClass', color=['darkorange', 'black'])
plt.show()
Profiles that are interested in the caravan insurance have these socio-demographic characteristics:
# Parallel-coordinates view of the product-contribution variables PO47..PO44,
# min-max normalised and coloured by predicted class.
feature_labels = ['Contribution car policies', 'Contribution boat policies',
                  'Contribution fire policies', 'Contribution private third party insurance']
subset = scores.loc[:, 'PO47':'PO44']
subset.columns = feature_labels
target = scores["scoreClass"]
# Rescale every variable to [0, 1] so the axes are comparable.
normalised = (subset - subset.min()) / (subset.max() - subset.min())
plot_df = pd.concat([normalised[feature_labels], target], axis=1)
plt.figure(figsize=(15, 8))
parallel_coordinates(plot_df, 'scoreClass', color=['darkorange', 'black'])
plt.show()
Profiles that are interested in the caravan insurance have these product-related characteristics:
- We delivered different models that maximize the AUC score, which means they are the most accurate at identifying potential positive clients; however, each model comes with a different cost — the rate of misclassifying the other class. So deciding which model to use is up to the company's marketing department.
- All these models can be further optimized by investing more time in fine-tuning their parameters or in choosing the best resampling ratio.